{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "#ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase1/analysis_results/shapeit2_phased_haplotypes/ALL.chr21.SHAPEIT2_integrated_phase1_v3.20101123.snps_indels_svs.genotypes.all.vcf.gz\n",
    "#ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase1/analysis_results/integrated_call_sets/integrated_call_samples.20101123.ped\n",
    "\n",
    "\n",
    "#https://mathgen.stats.ox.ac.uk/impute/scripts/vcf2impute_legend_haps\n",
    "#perl vcf2impute_legend_haps -vcf ALL.chr21.SHAPEIT2_integrated_phase1_v3.20101123.snps_indels_svs.genotypes.all.vcf.gz -leghap 21 -chr 21 -snps_only\n",
    "#python merge.py 21.hap.gz 21.legend.gz 21 > 21.merge\n",
    "#python clean_sample.py < 21.sample_list > 21.sample\n",
    "#impute_to_ped  21.merge 21.sample g21\n",
    "#germline  -input g21.ped 21.map -output good\n",
    "#gzip good.match.gz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from collections import defaultdict\n",
    "import gzip\n",
    "\n",
    "import scipy.stats as stats"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "germline_file = 'good.match.gz'\n",
    "sample_file = 'integrated_call_samples.20101123.ped'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "inds = set()\n",
    "ind_pop = {}\n",
    "selected_inds = {}\n",
    "pop_inds = defaultdict(list)\n",
    "\n",
    "with gzip.open(germline_file, 'rt', encoding='utf-8') as f:\n",
    "    for l in f:\n",
    "        toks = l.rstrip().split()\n",
    "        inds.add(toks[1])\n",
    "        inds.add(toks[3])\n",
    "\n",
    "with open(sample_file, 'rt', encoding='utf-8') as f:\n",
    "    f.readline()  # header\n",
    "    for l in f:\n",
    "        toks = l.rstrip().split('\\t')\n",
    "        fam = toks[0]\n",
    "        ind = toks[1]\n",
    "        pop = toks[6]\n",
    "        if ind not in inds:\n",
    "            continue\n",
    "        selected_inds[fam] = ind  # We just want one per family\n",
    "        ind_pop[ind] = pop\n",
    "\n",
    "for ind, pop in ind_pop.items():\n",
    "    pop_inds[pop].append(ind)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "larger_shared = (0, None, None)\n",
    "sizes = []\n",
    "all_sizes = []\n",
    "shared = defaultdict(int)\n",
    "with gzip.open(germline_file, 'rt', encoding='utf-8') as f:\n",
    "    for l in f:\n",
    "        toks = l.rstrip().split()\n",
    "        ind1 = toks[1]\n",
    "        ind2 = toks[3]\n",
    "        start = int(toks[5])\n",
    "        end = int(toks[6])\n",
    "        size = float(toks[10])\n",
    "        all_sizes.append(size)\n",
    "        if start > 10000000 and end < 15000000:\n",
    "            continue\n",
    "        if size > larger_shared[0]:\n",
    "            larger_shared = (size, ind1, ind2)\n",
    "        shared[tuple(sorted((ind1, ind2)))] += size\n",
    "        sizes.append(size)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DescribeResult(nobs=43736, minmax=(3.008, 24.508), mean=3.4114706191695627, variance=0.05003760874003961, skewness=39.76406261037047, kurtosis=2745.5735275532415)\n",
      "DescribeResult(nobs=173, minmax=(3.008, 24.508), mean=4.792404624277457, variance=7.752351021373841, skewness=3.6276764010875655, kurtosis=17.516780728998093)\n"
     ]
    }
   ],
   "source": [
    "print(stats.describe(all_sizes))\n",
    "print(stats.describe(sizes))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(24.508, 'HG00501', 'HG00512')\n",
      "CHS CHS\n"
     ]
    }
   ],
   "source": [
    "print(larger_shared)\n",
    "print(ind_pop[larger_shared[1]], ind_pop[larger_shared[2]])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "pop_shared = defaultdict(list)\n",
    "for inds, total_size in shared.items():\n",
    "    ind1, ind2 = inds\n",
    "    pop1 = ind_pop[ind1]\n",
    "    pop2 = ind_pop[ind2]\n",
    "    if pop1 == pop2:\n",
    "        pop_shared[pop1].append(total_size)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "PUR  55  28   4.05   3.82\n",
      "LWK  86  15   9.10   4.95\n",
      "GBR  89   7   5.66   5.99\n",
      "TSI  98   3   3.63   3.53\n",
      "FIN  91  38   3.97   3.63\n",
      "CHS  93  14  11.98  11.87\n",
      "CLM  59  12   3.89   3.43\n",
      "MXL  65  11   9.48   8.35\n",
      "ASW  50   2  16.61  16.61\n",
      "CEU  83   2   7.77   7.77\n",
      "CHB  88   1   3.82   3.82\n",
      "dict_keys(['CEU', 'ASW', 'MXL', 'CLM', 'GBR', 'FIN', 'IBS', 'YRI', 'CHB', 'JPT', 'LWK', 'TSI', 'PUR', 'CHS'])\n"
     ]
    }
   ],
   "source": [
    "for pop, total_sizes in pop_shared.items():\n",
    "    print(\"%s %3d %3d %*.2f %*.2f\" % (pop,\n",
    "                                    len(pop_inds[pop]), len(total_sizes),\n",
    "                                    6, scipy.mean(total_sizes),\n",
    "                                    6, scipy.median(total_sizes)))\n",
    "print(pop_inds.keys())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
